import sys
# Make the project package importable from the notebook.
sys.path.append("/home/jovyan/work/sem-covid/")
# Drop duplicate entries while KEEPING the search order: a set() has no
# defined order, so round-tripping through it could change which copy of a
# module gets imported.
sys.path = list(dict.fromkeys(sys.path))
import os
os.getcwd()
# Work from the project root (the path is hard-coded for this environment).
os.chdir('/home/jovyan/work/sem-covid/')
import pandas as pd
import plotly.express as px
from collections import Counter
from IPython.display import display, Markdown
import statsmodels.api as sm
import numpy as np
import scipy.stats as stats
from sem_covid.services.data_registry import Dataset
# Columns selected by fast_categorical_analyze() for the frequency analysis.
CATEGORICAL_COLUMNS = ['country', 'category', 'subcategory','actors', 'target_groups','Businesses','Citizens','Workers', 'funding']
# Fine-grained target-group labels that target_group_refactoring() aggregates into 'Businesses'.
BUSINESSES ={'Companies providing essential services', 'Contractors of a company', 'Larger corporations', 'One person or microenterprises', 'Other businesses', 'SMEs', 'Sector specific set of companies', 'Solo-self-employed', 'Start-ups'}
# Fine-grained target-group labels that target_group_refactoring() aggregates into 'Citizens'.
CITIZENS ={'Children (minors)', 'Disabled', 'Migrants', 'Older citizens', 'Other groups of citizens', 'Parents', 'People in care facilities', 'Refugees', 'Single parents', 'The COVID-19 risk group', 'Women', 'Youth (18-25)'}
# Fine-grained target-group labels that target_group_refactoring() aggregates into 'Workers'.
WORKERS = {'Cross-border commuters', 'Disabled workers', 'Employees in standard employment', 'Female workers', 'Migrants in employment', 'Older people in employment (aged 55+)', 'Other groups of workers', 'Parents in employment', 'Particular professions', 'Platform workers', 'Posted workers', 'Refugees in employment', 'Seasonal workers', 'Self-employed', 'Single parents in employment', 'The COVID-19 risk group at the workplace', 'Undeclared workers', 'Unemployed', 'Workers in care facilities', 'Workers in essential services', 'Workers in non-standard forms of employment', 'Youth (18-25) in employment'}
# Load the PWDB dataset through the project's data registry.
# NOTE(review): the result is used below as a pandas DataFrame - confirm against Dataset.fetch().
df = Dataset.PWDB.fetch()
100% (1288 of 1288) |####################| Elapsed Time: 0:00:00 Time: 0:00:00
def target_group_refactoring(pwdb_dataframe: pd.DataFrame, target_group_column_name: str = 'target_groups') -> pd.DataFrame:
    """
    Aggregate the granular target groups into three generic (L1) groups.

    The target groups in the original dataset are very granular. For this
    exercise they are aggregated into more generic sets, giving two levels:
        L1: workers, businesses, citizens
        L2: the original set of categories

    For each L1 group a column ('Businesses', 'Citizens', 'Workers') is added
    holding '<Group>_True' when at least one L2 label of the row belongs to
    that group and '<Group>_False' otherwise.

    :param pwdb_dataframe: dataset to extend; it is modified in place
    :param target_group_column_name: column holding the L2 labels of each row
    :return: the given dataset with the three extra L1 columns
    """
    new_columns = {'Businesses': BUSINESSES, 'Citizens': CITIZENS, 'Workers': WORKERS}
    target_groups = pwdb_dataframe[target_group_column_name]

    def belongs_to(groups, class_set) -> bool:
        # A cell normally holds a list of labels. A missing cell (None/NaN)
        # or a single bare label is tested directly instead of being iterated.
        if pd.api.types.is_list_like(groups):
            return any(item in class_set for item in groups)
        return groups in class_set

    for column, class_set in new_columns.items():
        membership = target_groups.apply(lambda groups, members=class_set: belongs_to(groups, members))
        # Assign the labelled column directly: an in-place replace() on
        # `pwdb_dataframe[column]` acts on a temporary object and is not
        # written back to the frame under pandas Copy-on-Write.
        pwdb_dataframe[column] = membership.map({True: column + "_True", False: column + "_False"})
    return pwdb_dataframe
# Add the aggregated 'Businesses', 'Citizens' and 'Workers' columns to the dataset.
df = target_group_refactoring(df)
def plot_bar_chart(observations: pd.DataFrame, chart_title: str):
    """
    Build a bar chart from a two-column observation table.

    The second column is plotted on the x axis and the first column on the
    y axis.

    :param observations: table whose first two columns are plotted
    :param chart_title: title of the chart
    :return: the figure created by plotly express
    """
    x_column = observations.columns[1]
    y_column = observations.columns[0]
    figure = px.bar(observations, x=x_column, y=y_column, title=chart_title)
    return figure
def plot_pie_chart(observations: pd.DataFrame, chart_title: str):
    """
    Build a pie chart from a two-column observation table.

    The second column supplies the slice values and the first column the
    slice names.

    :param observations: table whose first two columns are plotted
    :param chart_title: title of the chart
    :return: the figure created by plotly express
    """
    value_column = observations.columns[1]
    name_column = observations.columns[0]
    figure = px.pie(observations, values=value_column, names=name_column, title=chart_title)
    return figure
def calc_freq_categorical_data(data: pd.Series, title: str, relative: bool = False):
    """
    Count how often each value occurs in a categorical series.

    Missing values are ignored. Rows are ordered from the most to the least
    frequent value.

    :param data: the observations; the series is left untouched
    :param title: name given to the column holding the distinct values
    :param relative: when True report percentages (rounded to 2 decimals)
                     instead of absolute counts
    :return: DataFrame with the columns [title, 'Absolute freq'] or
             [title, 'Relative freq']
    """
    observation_type_name = 'Relative freq' if relative else 'Absolute freq'
    # dropna() returns a new series; dropping in place would silently remove
    # rows from the caller's object.
    values = data.dropna()
    observation = pd.DataFrame(Counter(values).most_common(), columns=[title, observation_type_name])
    # Nothing to normalise when there are no observations.
    if relative and not observation.empty:
        observation[observation_type_name] /= observation[observation_type_name].sum() / 100
        observation[observation_type_name] = observation[observation_type_name].round(2)
    return observation
def calc_freq_missing_data(data: pd.DataFrame, relative: bool = False):
    """
    Count the missing values of every column of a DataFrame.

    Each column is exploded first, so list-valued cells are counted per
    element and an empty list counts as one missing value. Columns without
    missing values are left out of the result.

    :param data: the dataset to inspect
    :param relative: when True report the share of missing values in percent
                     (rounded to 2 decimals) instead of the absolute count
    :return: DataFrame with the columns ['index', 'Absolute freq'] or
             ['index', 'Relative freq'], where 'index' holds the column names
    """
    observation_type_name = 'Relative freq' if relative else 'Absolute freq'
    missing = {}
    for column in data.columns:
        exploded = data[column].explode()
        n_missing = exploded.isnull().sum()
        if relative:
            # A column without any element has no missing share to report.
            n_missing = round(n_missing / (exploded.size / 100), 2) if exploded.size else 0.0
        missing[column] = n_missing
    # Build the series once with a fixed dtype. Growing a series element by
    # element and then overwriting an integer entry with a float relies on
    # implicit upcasting, which pandas has deprecated.
    frequencies = pd.Series(missing, dtype='float64' if relative else 'int64')
    observation = frequencies[frequencies > 0].to_frame(name=observation_type_name)
    return observation.reset_index()
def fast_categorical_analyze(data: pd.DataFrame, data_title: str = 'Unknown'):
    """
    Run a quick exploratory analysis of the categorical columns of a dataset.

    First the absolute number of missing values per column is displayed
    (with a pie chart when there are any). Then, for every column listed in
    CATEGORICAL_COLUMNS, the relative frequencies of its (exploded) values
    are computed and the ten most frequent values are displayed as a table,
    a bar chart and a pie chart.

    A column that cannot be analysed is reported and skipped; the remaining
    columns are still processed.

    :param data: the dataset; it must contain every column of CATEGORICAL_COLUMNS
    :param data_title: dataset name used in the title of the missing-values chart
    :return: dict mapping each analysed column name to its complete
             relative-frequency table (not only the ten displayed rows)
    """
    results = {}
    abs_miss_obs = calc_freq_missing_data(data)
    display(abs_miss_obs)
    if not abs_miss_obs.empty:
        plot_pie_chart(abs_miss_obs, data_title + ' missing values').show()
    data = data[CATEGORICAL_COLUMNS]
    for column_name in data.columns:
        data_column = data[column_name].explode()
        try:
            rel_obs = calc_freq_categorical_data(data_column, column_name, True)
            results[column_name] = rel_obs
            rel_obs = rel_obs.head(10)
            display(rel_obs)
            plot_bar_chart(rel_obs, column_name).show()
            plot_pie_chart(rel_obs, column_name).show()
        except Exception as error:
            # Best effort per column, but never swallow KeyboardInterrupt or
            # SystemExit, and always show what actually went wrong.
            print('Observation on [',column_name,'] fault!')
            print('Check if column [',column_name,'] have compatible type!')
            print('Reason:', repr(error))
    return results
# Run the exploratory analysis; the returned per-column relative-frequency tables are reused below.
eda_result = fast_categorical_analyze(df,"PWDB Dataset")
| index | Absolute freq | |
|---|---|---|
| 0 | end_date | 511 |
| 1 | target_groups | 340 |
| 2 | social_partner_involvement_form | 1288 |
| 3 | social_partner_role | 1288 |
| 4 | sectors | 797 |
| 5 | occupations | 1165 |
| 6 | sources | 1 |
| country | Relative freq | |
|---|---|---|
| 0 | Spain | 6.99 |
| 1 | Italy | 5.43 |
| 2 | Greece | 4.74 |
| 3 | Germany | 4.66 |
| 4 | Austria | 4.58 |
| 5 | Portugal | 4.50 |
| 6 | Lithuania | 3.57 |
| 7 | France | 3.57 |
| 8 | Czechia | 3.49 |
| 9 | Norway | 3.42 |
| category | Relative freq | |
|---|---|---|
| 0 | Supporting businesses to stay afloat | 29.66 |
| 1 | Protection of workers, adaptation of workplace | 14.36 |
| 2 | Income protection beyond short-time work | 11.65 |
| 3 | Promoting the economic, labour market and soci... | 10.09 |
| 4 | Employment protection and retention | 9.32 |
| 5 | Ensuring business continuity and support for e... | 8.77 |
| 6 | Measures to prevent social hardship | 7.14 |
| 7 | Reorientation of business activities | 5.59 |
| 8 | Supporting businesses to get back to normal | 3.42 |
| subcategory | Relative freq | |
|---|---|---|
| 0 | Direct subsidies (full or partial) | 14.36 |
| 1 | Access to finance | 8.00 |
| 2 | Income support for people in employment (e.g. ... | 7.22 |
| 3 | Deferral of payments or liabilities | 5.36 |
| 4 | Other | 5.28 |
| 5 | Occupational health and safety | 5.12 |
| 6 | Active labour market policies, incl. subsidise... | 4.66 |
| 7 | Extensions of income support to workers not c... | 4.58 |
| 8 | Change of production/innovation | 4.50 |
| 9 | Teleworking arrangements, remote working | 4.35 |
| actors | Relative freq | |
|---|---|---|
| 0 | National government | 38.76 |
| 1 | Company / Companies | 18.58 |
| 2 | Trade unions | 7.63 |
| 3 | Employers' organisations | 7.25 |
| 4 | Social partners jointly | 6.28 |
| 5 | Local / regional government | 5.43 |
| 6 | Public employment service | 4.41 |
| 7 | Social insurance | 4.03 |
| 8 | Other social actors (e.g. NGOs) | 3.05 |
| 9 | Public support service providers | 2.88 |
| target_groups | Relative freq | |
|---|---|---|
| 0 | Employees in standard employment | 15.36 |
| 1 | Sector specific set of companies | 14.43 |
| 2 | SMEs | 6.97 |
| 3 | Self-employed | 6.53 |
| 4 | Particular professions | 6.41 |
| 5 | Unemployed | 4.79 |
| 6 | One person or microenterprises | 4.73 |
| 7 | Solo-self-employed | 3.48 |
| 8 | Other businesses | 3.11 |
| 9 | Other groups of workers | 3.11 |
| Businesses | Relative freq | |
|---|---|---|
| 0 | Businesses_False | 64.52 |
| 1 | Businesses_True | 35.48 |
| Citizens | Relative freq | |
|---|---|---|
| 0 | Citizens_False | 93.09 |
| 1 | Citizens_True | 6.91 |
| Workers | Relative freq | |
|---|---|---|
| 0 | Workers_False | 54.11 |
| 1 | Workers_True | 45.89 |
| funding | Relative freq | |
|---|---|---|
| 0 | National funds | 47.79 |
| 1 | No special funding required | 18.22 |
| 2 | Companies | 10.21 |
| 3 | European Funds | 8.76 |
| 4 | Employer | 4.48 |
| 5 | Regional funds | 3.34 |
| 6 | Local funds | 1.70 |
| 7 | Other | 1.70 |
| 8 | Employers organisation | 1.58 |
| 9 | Employees | 1.13 |
# For every analysed column: extend its relative-frequency table with the
# z-score, the cumulative frequency and the row-to-row difference, print the
# standard deviation of the frequencies and plot the three derived measures.
for key, frequency_table in eda_result.items():
    data = frequency_table.copy()
    label_column = data.columns[0]
    column_name = data.columns[1]
    zscore_column = label_column + '_z_score'
    cumulative_freq = 'Cumulative freq'
    diff_freq = 'Diff freq'

    relative_freq = data[column_name]
    data[zscore_column] = stats.zscore(relative_freq)
    data[zscore_column] = data[zscore_column].apply(lambda score: round(score, 2))
    data[cumulative_freq] = relative_freq.cumsum()
    data[diff_freq] = relative_freq.diff()

    std_dev = round(relative_freq.std(), 2)
    display(Markdown(f"Std deviation for [{key}] is [{std_dev}]"))
    display(data)

    # z-score per value, then cumulative and difference per value.
    px.bar(data, x=zscore_column, y=label_column).show()
    for measure in (cumulative_freq, diff_freq):
        px.bar(data, x=label_column, y=measure).show()
Std deviation for [country] is [1.13]
| country | Relative freq | country_z_score | Cumulative freq | Diff freq | |
|---|---|---|---|---|---|
| 0 | Spain | 6.99 | 3.30 | 6.99 | NaN |
| 1 | Italy | 5.43 | 1.89 | 12.42 | -1.56 |
| 2 | Greece | 4.74 | 1.27 | 17.16 | -0.69 |
| 3 | Germany | 4.66 | 1.20 | 21.82 | -0.08 |
| 4 | Austria | 4.58 | 1.13 | 26.40 | -0.08 |
| 5 | Portugal | 4.50 | 1.05 | 30.90 | -0.08 |
| 6 | Lithuania | 3.57 | 0.21 | 34.47 | -0.93 |
| 7 | France | 3.57 | 0.21 | 38.04 | 0.00 |
| 8 | Czechia | 3.49 | 0.14 | 41.53 | -0.08 |
| 9 | Norway | 3.42 | 0.08 | 44.95 | -0.07 |
| 10 | Ireland | 3.42 | 0.08 | 48.37 | 0.00 |
| 11 | Croatia | 3.34 | 0.01 | 51.71 | -0.08 |
| 12 | Belgium | 3.34 | 0.01 | 55.05 | 0.00 |
| 13 | Poland | 3.26 | -0.07 | 58.31 | -0.08 |
| 14 | Finland | 3.11 | -0.20 | 61.42 | -0.15 |
| 15 | Luxembourg | 3.03 | -0.27 | 64.45 | -0.08 |
| 16 | Netherlands | 3.03 | -0.27 | 67.48 | 0.00 |
| 17 | Malta | 3.03 | -0.27 | 70.51 | 0.00 |
| 18 | Denmark | 2.95 | -0.35 | 73.46 | -0.08 |
| 19 | Slovenia | 2.95 | -0.35 | 76.41 | 0.00 |
| 20 | Cyprus | 2.87 | -0.42 | 79.28 | -0.08 |
| 21 | Latvia | 2.87 | -0.42 | 82.15 | 0.00 |
| 22 | Romania | 2.72 | -0.55 | 84.87 | -0.15 |
| 23 | Slovakia | 2.72 | -0.55 | 87.59 | 0.00 |
| 24 | United Kingdom | 2.56 | -0.70 | 90.15 | -0.16 |
| 25 | Sweden | 2.41 | -0.83 | 92.56 | -0.15 |
| 26 | Bulgaria | 2.25 | -0.98 | 94.81 | -0.16 |
| 27 | Hungary | 1.94 | -1.26 | 96.75 | -0.31 |
| 28 | European Union | 1.71 | -1.47 | 98.46 | -0.23 |
| 29 | Estonia | 1.55 | -1.61 | 100.01 | -0.16 |
Std deviation for [category] is [7.67]
| category | Relative freq | category_z_score | Cumulative freq | Diff freq | |
|---|---|---|---|---|---|
| 0 | Supporting businesses to stay afloat | 29.66 | 2.57 | 29.66 | NaN |
| 1 | Protection of workers, adaptation of workplace | 14.36 | 0.45 | 44.02 | -15.30 |
| 2 | Income protection beyond short-time work | 11.65 | 0.07 | 55.67 | -2.71 |
| 3 | Promoting the economic, labour market and soci... | 10.09 | -0.14 | 65.76 | -1.56 |
| 4 | Employment protection and retention | 9.32 | -0.25 | 75.08 | -0.77 |
| 5 | Ensuring business continuity and support for e... | 8.77 | -0.32 | 83.85 | -0.55 |
| 6 | Measures to prevent social hardship | 7.14 | -0.55 | 90.99 | -1.63 |
| 7 | Reorientation of business activities | 5.59 | -0.76 | 96.58 | -1.55 |
| 8 | Supporting businesses to get back to normal | 3.42 | -1.06 | 100.00 | -2.17 |
Std deviation for [subcategory] is [2.83]
| subcategory | Relative freq | subcategory_z_score | Cumulative freq | Diff freq | |
|---|---|---|---|---|---|
| 0 | Direct subsidies (full or partial) | 14.36 | 4.15 | 14.36 | NaN |
| 1 | Access to finance | 8.00 | 1.87 | 22.36 | -6.36 |
| 2 | Income support for people in employment (e.g. ... | 7.22 | 1.59 | 29.58 | -0.78 |
| 3 | Deferral of payments or liabilities | 5.36 | 0.93 | 34.94 | -1.86 |
| 4 | Other | 5.28 | 0.90 | 40.22 | -0.08 |
| 5 | Occupational health and safety | 5.12 | 0.84 | 45.34 | -0.16 |
| 6 | Active labour market policies, incl. subsidise... | 4.66 | 0.68 | 50.00 | -0.46 |
| 7 | Extensions of income support to workers not c... | 4.58 | 0.65 | 54.58 | -0.08 |
| 8 | Change of production/innovation | 4.50 | 0.62 | 59.08 | -0.08 |
| 9 | Teleworking arrangements, remote working | 4.35 | 0.56 | 63.43 | -0.15 |
| 10 | Protection of vulnerable groups (beyond employ... | 2.95 | 0.06 | 66.38 | -1.40 |
| 11 | Support for spending, stimulus packages | 2.64 | -0.05 | 69.02 | -0.31 |
| 12 | Change of work arrangements (working time, rot... | 2.64 | -0.05 | 71.66 | 0.00 |
| 13 | Support for parents and carers (financial or i... | 2.64 | -0.05 | 74.30 | 0.00 |
| 14 | Remuneration and rewards for workers in essent... | 2.64 | -0.05 | 76.94 | 0.00 |
| 15 | Paid sick leave | 2.33 | -0.16 | 79.27 | -0.31 |
| 16 | Well-being of workers | 2.10 | -0.24 | 81.37 | -0.23 |
| 17 | Keeping a safe home | 2.02 | -0.27 | 83.39 | -0.08 |
| 18 | Income support for unemployed | 1.94 | -0.30 | 85.33 | -0.08 |
| 19 | Mobilisation of a larger workforce | 1.63 | -0.41 | 86.96 | -0.31 |
| 20 | Enhancing employability and training | 1.55 | -0.44 | 88.51 | -0.08 |
| 21 | Smoothing frictions or reallocation of workers | 1.48 | -0.47 | 89.99 | -0.07 |
| 22 | Measures to support a gradual relaunch of work | 1.48 | -0.47 | 91.47 | 0.00 |
| 23 | Flexibilisation and security | 1.16 | -0.58 | 92.63 | -0.32 |
| 24 | Changes of working hours or work arrangements | 1.09 | -0.61 | 93.72 | -0.07 |
| 25 | Working time and working time flexibility | 1.01 | -0.63 | 94.73 | -0.08 |
| 26 | Preventing over-indebtedness | 0.85 | -0.69 | 95.58 | -0.16 |
| 27 | Changes in work organisation | 0.85 | -0.69 | 96.43 | 0.00 |
| 28 | Wage flexibility | 0.78 | -0.72 | 97.21 | -0.07 |
| 29 | Provision of services in kind (e.g. food vouch... | 0.62 | -0.77 | 97.83 | -0.16 |
| 30 | Rescue procedures in case of insolvency or ada... | 0.47 | -0.83 | 98.30 | -0.15 |
| 31 | Access to healthcare | 0.47 | -0.83 | 98.77 | 0.00 |
| 32 | Creation of platforms for businesses aimed at ... | 0.39 | -0.86 | 99.16 | -0.08 |
| 33 | Matching / networking | 0.31 | -0.89 | 99.47 | -0.08 |
| 34 | Transfer or redeployment of workers | 0.31 | -0.89 | 99.78 | 0.00 |
| 35 | Changes of management approach | 0.23 | -0.91 | 100.01 | -0.08 |
Std deviation for [actors] is [10.66]
| actors | Relative freq | actors_z_score | Cumulative freq | Diff freq | |
|---|---|---|---|---|---|
| 0 | National government | 38.76 | 2.98 | 38.76 | NaN |
| 1 | Company / Companies | 18.58 | 1.00 | 57.34 | -20.18 |
| 2 | Trade unions | 7.63 | -0.07 | 64.97 | -10.95 |
| 3 | Employers' organisations | 7.25 | -0.11 | 72.22 | -0.38 |
| 4 | Social partners jointly | 6.28 | -0.20 | 78.50 | -0.97 |
| 5 | Local / regional government | 5.43 | -0.28 | 83.93 | -0.85 |
| 6 | Public employment service | 4.41 | -0.38 | 88.34 | -1.02 |
| 7 | Social insurance | 4.03 | -0.42 | 92.37 | -0.38 |
| 8 | Other social actors (e.g. NGOs) | 3.05 | -0.52 | 95.42 | -0.98 |
| 9 | Public support service providers | 2.88 | -0.53 | 98.30 | -0.17 |
| 10 | EU (Council, EC, EP) | 1.31 | -0.69 | 99.61 | -1.57 |
| 11 | EU level social partners | 0.38 | -0.78 | 99.99 | -0.93 |
Std deviation for [target_groups] is [3.38]
| target_groups | Relative freq | target_groups_z_score | Cumulative freq | Diff freq | |
|---|---|---|---|---|---|
| 0 | Employees in standard employment | 15.36 | 3.89 | 15.36 | NaN |
| 1 | Sector specific set of companies | 14.43 | 3.61 | 29.79 | -0.93 |
| 2 | SMEs | 6.97 | 1.37 | 36.76 | -7.46 |
| 3 | Self-employed | 6.53 | 1.24 | 43.29 | -0.44 |
| 4 | Particular professions | 6.41 | 1.21 | 49.70 | -0.12 |
| 5 | Unemployed | 4.79 | 0.72 | 54.49 | -1.62 |
| 6 | One person or microenterprises | 4.73 | 0.70 | 59.22 | -0.06 |
| 7 | Solo-self-employed | 3.48 | 0.33 | 62.70 | -1.25 |
| 8 | Other businesses | 3.11 | 0.22 | 65.81 | -0.37 |
| 9 | Other groups of workers | 3.11 | 0.22 | 68.92 | 0.00 |
| 10 | Companies providing essential services | 2.74 | 0.11 | 71.66 | -0.37 |
| 11 | Larger corporations | 2.74 | 0.11 | 74.40 | 0.00 |
| 12 | Workers in essential services | 2.61 | 0.07 | 77.01 | -0.13 |
| 13 | Workers in non-standard forms of employment | 2.61 | 0.07 | 79.62 | 0.00 |
| 14 | Workers in care facilities | 1.80 | -0.17 | 81.42 | -0.81 |
| 15 | Other groups of citizens | 1.68 | -0.21 | 83.10 | -0.12 |
| 16 | Parents in employment | 1.68 | -0.21 | 84.78 | 0.00 |
| 17 | Disabled workers | 1.68 | -0.21 | 86.46 | 0.00 |
| 18 | Disabled | 1.62 | -0.23 | 88.08 | -0.06 |
| 19 | The COVID-19 risk group at the workplace | 1.43 | -0.28 | 89.51 | -0.19 |
| 20 | Older citizens | 1.24 | -0.34 | 90.75 | -0.19 |
| 21 | Parents | 1.18 | -0.36 | 91.93 | -0.06 |
| 22 | Children (minors) | 1.00 | -0.41 | 92.93 | -0.18 |
| 23 | Start-ups | 1.00 | -0.41 | 93.93 | 0.00 |
| 24 | The COVID-19 risk group | 0.93 | -0.43 | 94.86 | -0.07 |
| 25 | Youth (18-25) in employment | 0.75 | -0.49 | 95.61 | -0.18 |
| 26 | Seasonal workers | 0.75 | -0.49 | 96.36 | 0.00 |
| 27 | Youth (18-25) | 0.62 | -0.53 | 96.98 | -0.13 |
| 28 | People in care facilities | 0.44 | -0.58 | 97.42 | -0.18 |
| 29 | Migrants in employment | 0.37 | -0.60 | 97.79 | -0.07 |
| 30 | Older people in employment (aged 55+) | 0.37 | -0.60 | 98.16 | 0.00 |
| 31 | Posted workers | 0.25 | -0.64 | 98.41 | -0.12 |
| 32 | Single parents | 0.25 | -0.64 | 98.66 | 0.00 |
| 33 | Cross-border commuters | 0.25 | -0.64 | 98.91 | 0.00 |
| 34 | Single parents in employment | 0.25 | -0.64 | 99.16 | 0.00 |
| 35 | Migrants | 0.19 | -0.66 | 99.35 | -0.06 |
| 36 | Contractors of a company | 0.19 | -0.66 | 99.54 | 0.00 |
| 37 | Women | 0.12 | -0.68 | 99.66 | -0.07 |
| 38 | Platform workers | 0.12 | -0.68 | 99.78 | 0.00 |
| 39 | Undeclared workers | 0.12 | -0.68 | 99.90 | 0.00 |
| 40 | Refugees | 0.06 | -0.70 | 99.96 | -0.06 |
| 41 | Female workers | 0.06 | -0.70 | 100.02 | 0.00 |
Std deviation for [Businesses] is [20.53]
| Businesses | Relative freq | Businesses_z_score | Cumulative freq | Diff freq | |
|---|---|---|---|---|---|
| 0 | Businesses_False | 64.52 | 1.0 | 64.52 | NaN |
| 1 | Businesses_True | 35.48 | -1.0 | 100.00 | -29.04 |
Std deviation for [Citizens] is [60.94]
| Citizens | Relative freq | Citizens_z_score | Cumulative freq | Diff freq | |
|---|---|---|---|---|---|
| 0 | Citizens_False | 93.09 | 1.0 | 93.09 | NaN |
| 1 | Citizens_True | 6.91 | -1.0 | 100.00 | -86.18 |
Std deviation for [Workers] is [5.81]
| Workers | Relative freq | Workers_z_score | Cumulative freq | Diff freq | |
|---|---|---|---|---|---|
| 0 | Workers_False | 54.11 | 1.0 | 54.11 | NaN |
| 1 | Workers_True | 45.89 | -1.0 | 100.00 | -8.22 |
Std deviation for [funding] is [13.5]
| funding | Relative freq | funding_z_score | Cumulative freq | Diff freq | |
|---|---|---|---|---|---|
| 0 | National funds | 47.79 | 3.05 | 47.79 | NaN |
| 1 | No special funding required | 18.22 | 0.76 | 66.01 | -29.57 |
| 2 | Companies | 10.21 | 0.15 | 76.22 | -8.01 |
| 3 | European Funds | 8.76 | 0.03 | 84.98 | -1.45 |
| 4 | Employer | 4.48 | -0.30 | 89.46 | -4.28 |
| 5 | Regional funds | 3.34 | -0.39 | 92.80 | -1.14 |
| 6 | Local funds | 1.70 | -0.51 | 94.50 | -1.64 |
| 7 | Other | 1.70 | -0.51 | 96.20 | 0.00 |
| 8 | Employers organisation | 1.58 | -0.52 | 97.78 | -0.12 |
| 9 | Employees | 1.13 | -0.56 | 98.91 | -0.45 |
| 10 | Trade union | 0.88 | -0.58 | 99.79 | -0.25 |
| 11 | Social partners jointly | 0.19 | -0.63 | 99.98 | -0.69 |
def confidence_interval_with_mean(series: pd.Series):
    """
    Compute a normal-approximation confidence interval around the mean.

    The interval is mean +/- z * standard error with z = 1.96 (the two-sided
    95% normal quantile). It is expressed in percent, rounded to 2 decimals
    and clamped to the range [0, 100].

    :param series: the values, given as fractions (e.g. 0.25 for 25%)
    :return: [left_limit, right_limit] in percent
    """
    # Local import: reach the real builtins even if `max`/`min` were rebound
    # at module level, without depending on `__builtins__`, which is a dict
    # rather than a module outside of `__main__`.
    import builtins

    se = series.std() / np.sqrt(series.size)
    mean = series.mean()
    z = 1.96
    left_limit = builtins.max(round(100 * (mean - z * se), 2), 0)
    right_limit = builtins.min(round(100 * (mean + z * se), 2), 100)
    return [left_limit, right_limit]
def confidence_interval_for_proportion(series: pd.Series, nobs: int = None):
    """
    Compute a confidence interval for every proportion of a series.

    Each interval comes from statsmodels' proportion_confint (default
    settings), called with nobs * p successes out of nobs trials. The bounds
    are converted to percent and rounded to 2 decimals.

    :param series: the proportions, given as fractions (e.g. 0.25 for 25%)
    :param nobs: number of observations the proportions were computed from;
                 when omitted the module-level variable `n` is used, which is
                 what this function relied on before the parameter existed
    :return: list with one closed pd.Interval (in percent) per proportion
    """
    if nobs is None:
        # Backward compatibility with callers that set the global `n`.
        nobs = n
    bounds = [list(sm.stats.proportion_confint(nobs * p, nobs)) for p in series]
    bounds_percent = pd.DataFrame(bounds).mul(100).round(2)
    return [
        pd.Interval(lower, upper, closed='both')
        for lower, upper in bounds_percent.itertuples(index=False, name=None)
    ]
def z_score_for_series(series : pd.Series):
return pd.Series(stats.zscore(series)).apply(lambda x: round(x,2))
# For every analysed column: compute a confidence interval around the mean
# relative frequency, add a per-value proportion confidence interval and
# z-score, and split the values into over-, normally and under-represented
# ones depending on where their frequency lies relative to the interval
# around the mean.
for key, frequency_table in eda_result.items():
    data = frequency_table.copy()
    # Sample size behind the proportions: the number of non-missing
    # (exploded) observations of the column. `data.size` is only the number
    # of cells of the frequency table (rows x 2), which made the per-value
    # intervals far too wide.
    n = df[key].explode().count()
    # Relative frequencies as fractions instead of percentages.
    tmp_s = data[data.columns[1]] / 100
    ci_mean = confidence_interval_with_mean(tmp_s)
    display(Markdown(f"Confidence Interval for {key} is : [{ci_mean[0]}%, {ci_mean[1]}%]"))
    data["Confidence Interval"] = confidence_interval_for_proportion(tmp_s)
    data["z_score"] = z_score_for_series(tmp_s)
    display(data)

    rel_f = 'Relative freq'
    above = data[rel_f] > ci_mean[1]
    below = data[rel_f] < ci_mean[0]
    within = (data[rel_f] >= ci_mean[0]) & (data[rel_f] <= ci_mean[1])

    display(Markdown(f"Overrepresented records from column : {key}"))
    display(data.loc[above])
    display(Markdown(f"Normal represented records from column : {key}"))
    display(data.loc[within])
    display(Markdown(f"Underrepresented records from column : {key}"))
    display(data.loc[below])
Confidence Interval for country is : [2.93%, 3.74%]
| country | Relative freq | Confidence Interval | z_score | |
|---|---|---|---|---|
| 0 | Spain | 6.99 | [0.54, 13.44] | 3.30 |
| 1 | Italy | 5.43 | [0.0, 11.16] | 1.89 |
| 2 | Greece | 4.74 | [0.0, 10.12] | 1.27 |
| 3 | Germany | 4.66 | [0.0, 9.99] | 1.20 |
| 4 | Austria | 4.58 | [0.0, 9.87] | 1.13 |
| 5 | Portugal | 4.50 | [0.0, 9.75] | 1.05 |
| 6 | Lithuania | 3.57 | [0.0, 8.26] | 0.21 |
| 7 | France | 3.57 | [0.0, 8.26] | 0.21 |
| 8 | Czechia | 3.49 | [0.0, 8.13] | 0.14 |
| 9 | Norway | 3.42 | [0.0, 8.02] | 0.08 |
| 10 | Ireland | 3.42 | [0.0, 8.02] | 0.08 |
| 11 | Croatia | 3.34 | [0.0, 7.89] | 0.01 |
| 12 | Belgium | 3.34 | [0.0, 7.89] | 0.01 |
| 13 | Poland | 3.26 | [0.0, 7.75] | -0.07 |
| 14 | Finland | 3.11 | [0.0, 7.5] | -0.20 |
| 15 | Luxembourg | 3.03 | [0.0, 7.37] | -0.27 |
| 16 | Netherlands | 3.03 | [0.0, 7.37] | -0.27 |
| 17 | Malta | 3.03 | [0.0, 7.37] | -0.27 |
| 18 | Denmark | 2.95 | [0.0, 7.23] | -0.35 |
| 19 | Slovenia | 2.95 | [0.0, 7.23] | -0.35 |
| 20 | Cyprus | 2.87 | [0.0, 7.09] | -0.42 |
| 21 | Latvia | 2.87 | [0.0, 7.09] | -0.42 |
| 22 | Romania | 2.72 | [0.0, 6.84] | -0.55 |
| 23 | Slovakia | 2.72 | [0.0, 6.84] | -0.55 |
| 24 | United Kingdom | 2.56 | [0.0, 6.56] | -0.70 |
| 25 | Sweden | 2.41 | [0.0, 6.29] | -0.83 |
| 26 | Bulgaria | 2.25 | [0.0, 6.0] | -0.98 |
| 27 | Hungary | 1.94 | [0.0, 5.43] | -1.26 |
| 28 | European Union | 1.71 | [0.0, 4.99] | -1.47 |
| 29 | Estonia | 1.55 | [0.0, 4.68] | -1.61 |
Overrepresented records from column : country
| country | Relative freq | Confidence Interval | z_score | |
|---|---|---|---|---|
| 0 | Spain | 6.99 | [0.54, 13.44] | 3.30 |
| 1 | Italy | 5.43 | [0.0, 11.16] | 1.89 |
| 2 | Greece | 4.74 | [0.0, 10.12] | 1.27 |
| 3 | Germany | 4.66 | [0.0, 9.99] | 1.20 |
| 4 | Austria | 4.58 | [0.0, 9.87] | 1.13 |
| 5 | Portugal | 4.50 | [0.0, 9.75] | 1.05 |
Normal represented records from column : country
| country | Relative freq | Confidence Interval | z_score | |
|---|---|---|---|---|
| 6 | Lithuania | 3.57 | [0.0, 8.26] | 0.21 |
| 7 | France | 3.57 | [0.0, 8.26] | 0.21 |
| 8 | Czechia | 3.49 | [0.0, 8.13] | 0.14 |
| 9 | Norway | 3.42 | [0.0, 8.02] | 0.08 |
| 10 | Ireland | 3.42 | [0.0, 8.02] | 0.08 |
| 11 | Croatia | 3.34 | [0.0, 7.89] | 0.01 |
| 12 | Belgium | 3.34 | [0.0, 7.89] | 0.01 |
| 13 | Poland | 3.26 | [0.0, 7.75] | -0.07 |
| 14 | Finland | 3.11 | [0.0, 7.5] | -0.20 |
| 15 | Luxembourg | 3.03 | [0.0, 7.37] | -0.27 |
| 16 | Netherlands | 3.03 | [0.0, 7.37] | -0.27 |
| 17 | Malta | 3.03 | [0.0, 7.37] | -0.27 |
| 18 | Denmark | 2.95 | [0.0, 7.23] | -0.35 |
| 19 | Slovenia | 2.95 | [0.0, 7.23] | -0.35 |
Underrepresented records from column : country
| country | Relative freq | Confidence Interval | z_score | |
|---|---|---|---|---|
| 20 | Cyprus | 2.87 | [0.0, 7.09] | -0.42 |
| 21 | Latvia | 2.87 | [0.0, 7.09] | -0.42 |
| 22 | Romania | 2.72 | [0.0, 6.84] | -0.55 |
| 23 | Slovakia | 2.72 | [0.0, 6.84] | -0.55 |
| 24 | United Kingdom | 2.56 | [0.0, 6.56] | -0.70 |
| 25 | Sweden | 2.41 | [0.0, 6.29] | -0.83 |
| 26 | Bulgaria | 2.25 | [0.0, 6.0] | -0.98 |
| 27 | Hungary | 1.94 | [0.0, 5.43] | -1.26 |
| 28 | European Union | 1.71 | [0.0, 4.99] | -1.47 |
| 29 | Estonia | 1.55 | [0.0, 4.68] | -1.61 |
Confidence Interval for category is : [6.1%, 16.12%]
| category | Relative freq | Confidence Interval | z_score | |
|---|---|---|---|---|
| 0 | Supporting businesses to stay afloat | 29.66 | [8.56, 50.76] | 2.57 |
| 1 | Protection of workers, adaptation of workplace | 14.36 | [0.0, 30.56] | 0.45 |
| 2 | Income protection beyond short-time work | 11.65 | [0.0, 26.47] | 0.07 |
| 3 | Promoting the economic, labour market and soci... | 10.09 | [0.0, 24.0] | -0.14 |
| 4 | Employment protection and retention | 9.32 | [0.0, 22.75] | -0.25 |
| 5 | Ensuring business continuity and support for e... | 8.77 | [0.0, 21.84] | -0.32 |
| 6 | Measures to prevent social hardship | 7.14 | [0.0, 19.04] | -0.55 |
| 7 | Reorientation of business activities | 5.59 | [0.0, 16.2] | -0.76 |
| 8 | Supporting businesses to get back to normal | 3.42 | [0.0, 11.82] | -1.06 |
Overrepresented records from column : category
| category | Relative freq | Confidence Interval | z_score | |
|---|---|---|---|---|
| 0 | Supporting businesses to stay afloat | 29.66 | [8.56, 50.76] | 2.57 |
Normal represented records from column : category
| category | Relative freq | Confidence Interval | z_score | |
|---|---|---|---|---|
| 1 | Protection of workers, adaptation of workplace | 14.36 | [0.0, 30.56] | 0.45 |
| 2 | Income protection beyond short-time work | 11.65 | [0.0, 26.47] | 0.07 |
| 3 | Promoting the economic, labour market and soci... | 10.09 | [0.0, 24.0] | -0.14 |
| 4 | Employment protection and retention | 9.32 | [0.0, 22.75] | -0.25 |
| 5 | Ensuring business continuity and support for e... | 8.77 | [0.0, 21.84] | -0.32 |
| 6 | Measures to prevent social hardship | 7.14 | [0.0, 19.04] | -0.55 |
Underrepresented records from column : category
| category | Relative freq | Confidence Interval | z_score | |
|---|---|---|---|---|
| 7 | Reorientation of business activities | 5.59 | [0.0, 16.2] | -0.76 |
| 8 | Supporting businesses to get back to normal | 3.42 | [0.0, 11.82] | -1.06 |
Confidence Interval for subcategory is : [1.85%, 3.7%]
| subcategory | Relative freq | Confidence Interval | z_score | |
|---|---|---|---|---|
| 0 | Direct subsidies (full or partial) | 14.36 | [6.26, 22.46] | 4.15 |
| 1 | Access to finance | 8.00 | [1.73, 14.27] | 1.87 |
| 2 | Income support for people in employment (e.g. ... | 7.22 | [1.24, 13.2] | 1.59 |
| 3 | Deferral of payments or liabilities | 5.36 | [0.16, 10.56] | 0.93 |
| 4 | Other | 5.28 | [0.11, 10.45] | 0.90 |
| 5 | Occupational health and safety | 5.12 | [0.03, 10.21] | 0.84 |
| 6 | Active labour market policies, incl. subsidise... | 4.66 | [0.0, 9.53] | 0.68 |
| 7 | Extensions of income support to workers not c... | 4.58 | [0.0, 9.41] | 0.65 |
| 8 | Change of production/innovation | 4.50 | [0.0, 9.29] | 0.62 |
| 9 | Teleworking arrangements, remote working | 4.35 | [0.0, 9.06] | 0.56 |
| 10 | Protection of vulnerable groups (beyond employ... | 2.95 | [0.0, 6.86] | 0.06 |
| 11 | Support for spending, stimulus packages | 2.64 | [0.0, 6.34] | -0.05 |
| 12 | Change of work arrangements (working time, rot... | 2.64 | [0.0, 6.34] | -0.05 |
| 13 | Support for parents and carers (financial or i... | 2.64 | [0.0, 6.34] | -0.05 |
| 14 | Remuneration and rewards for workers in essent... | 2.64 | [0.0, 6.34] | -0.05 |
| 15 | Paid sick leave | 2.33 | [0.0, 5.81] | -0.16 |
| 16 | Well-being of workers | 2.10 | [0.0, 5.41] | -0.24 |
| 17 | Keeping a safe home | 2.02 | [0.0, 5.27] | -0.27 |
| 18 | Income support for unemployed | 1.94 | [0.0, 5.13] | -0.30 |
| 19 | Mobilisation of a larger workforce | 1.63 | [0.0, 4.55] | -0.41 |
| 20 | Enhancing employability and training | 1.55 | [0.0, 4.4] | -0.44 |
| 21 | Smoothing frictions or reallocation of workers | 1.48 | [0.0, 4.27] | -0.47 |
| 22 | Measures to support a gradual relaunch of work | 1.48 | [0.0, 4.27] | -0.47 |
| 23 | Flexibilisation and security | 1.16 | [0.0, 3.63] | -0.58 |
| 24 | Changes of working hours or work arrangements | 1.09 | [0.0, 3.49] | -0.61 |
| 25 | Working time and working time flexibility | 1.01 | [0.0, 3.32] | -0.63 |
| 26 | Preventing over-indebtedness | 0.85 | [0.0, 2.97] | -0.69 |
| 27 | Changes in work organisation | 0.85 | [0.0, 2.97] | -0.69 |
| 28 | Wage flexibility | 0.78 | [0.0, 2.81] | -0.72 |
| 29 | Provision of services in kind (e.g. food vouch... | 0.62 | [0.0, 2.43] | -0.77 |
| 30 | Rescue procedures in case of insolvency or ada... | 0.47 | [0.0, 2.05] | -0.83 |
| 31 | Access to healthcare | 0.47 | [0.0, 2.05] | -0.83 |
| 32 | Creation of platforms for businesses aimed at ... | 0.39 | [0.0, 1.83] | -0.86 |
| 33 | Matching / networking | 0.31 | [0.0, 1.59] | -0.89 |
| 34 | Transfer or redeployment of workers | 0.31 | [0.0, 1.59] | -0.89 |
| 35 | Changes of management approach | 0.23 | [0.0, 1.34] | -0.91 |
Overrepresented records from column : subcategory
| subcategory | Relative freq | Confidence Interval | z_score | |
|---|---|---|---|---|
| 0 | Direct subsidies (full or partial) | 14.36 | [6.26, 22.46] | 4.15 |
| 1 | Access to finance | 8.00 | [1.73, 14.27] | 1.87 |
| 2 | Income support for people in employment (e.g. ... | 7.22 | [1.24, 13.2] | 1.59 |
| 3 | Deferral of payments or liabilities | 5.36 | [0.16, 10.56] | 0.93 |
| 4 | Other | 5.28 | [0.11, 10.45] | 0.90 |
| 5 | Occupational health and safety | 5.12 | [0.03, 10.21] | 0.84 |
| 6 | Active labour market policies, incl. subsidise... | 4.66 | [0.0, 9.53] | 0.68 |
| 7 | Extensions of income support to workers not c... | 4.58 | [0.0, 9.41] | 0.65 |
| 8 | Change of production/innovation | 4.50 | [0.0, 9.29] | 0.62 |
| 9 | Teleworking arrangements, remote working | 4.35 | [0.0, 9.06] | 0.56 |
Normal represented records from column : subcategory
| subcategory | Relative freq | Confidence Interval | z_score | |
|---|---|---|---|---|
| 10 | Protection of vulnerable groups (beyond employ... | 2.95 | [0.0, 6.86] | 0.06 |
| 11 | Support for spending, stimulus packages | 2.64 | [0.0, 6.34] | -0.05 |
| 12 | Change of work arrangements (working time, rot... | 2.64 | [0.0, 6.34] | -0.05 |
| 13 | Support for parents and carers (financial or i... | 2.64 | [0.0, 6.34] | -0.05 |
| 14 | Remuneration and rewards for workers in essent... | 2.64 | [0.0, 6.34] | -0.05 |
| 15 | Paid sick leave | 2.33 | [0.0, 5.81] | -0.16 |
| 16 | Well-being of workers | 2.10 | [0.0, 5.41] | -0.24 |
| 17 | Keeping a safe home | 2.02 | [0.0, 5.27] | -0.27 |
| 18 | Income support for unemployed | 1.94 | [0.0, 5.13] | -0.30 |
Underrepresented records from column : subcategory
| subcategory | Relative freq | Confidence Interval | z_score | |
|---|---|---|---|---|
| 19 | Mobilisation of a larger workforce | 1.63 | [0.0, 4.55] | -0.41 |
| 20 | Enhancing employability and training | 1.55 | [0.0, 4.4] | -0.44 |
| 21 | Smoothing frictions or reallocation of workers | 1.48 | [0.0, 4.27] | -0.47 |
| 22 | Measures to support a gradual relaunch of work | 1.48 | [0.0, 4.27] | -0.47 |
| 23 | Flexibilisation and security | 1.16 | [0.0, 3.63] | -0.58 |
| 24 | Changes of working hours or work arrangements | 1.09 | [0.0, 3.49] | -0.61 |
| 25 | Working time and working time flexibility | 1.01 | [0.0, 3.32] | -0.63 |
| 26 | Preventing over-indebtedness | 0.85 | [0.0, 2.97] | -0.69 |
| 27 | Changes in work organisation | 0.85 | [0.0, 2.97] | -0.69 |
| 28 | Wage flexibility | 0.78 | [0.0, 2.81] | -0.72 |
| 29 | Provision of services in kind (e.g. food vouch... | 0.62 | [0.0, 2.43] | -0.77 |
| 30 | Rescue procedures in case of insolvency or ada... | 0.47 | [0.0, 2.05] | -0.83 |
| 31 | Access to healthcare | 0.47 | [0.0, 2.05] | -0.83 |
| 32 | Creation of platforms for businesses aimed at ... | 0.39 | [0.0, 1.83] | -0.86 |
| 33 | Matching / networking | 0.31 | [0.0, 1.59] | -0.89 |
| 34 | Transfer or redeployment of workers | 0.31 | [0.0, 1.59] | -0.89 |
| 35 | Changes of management approach | 0.23 | [0.0, 1.34] | -0.91 |
Confidence Interval for actors is : [2.3%, 14.36%]
| actors | Relative freq | Confidence Interval | z_score | |
|---|---|---|---|---|
| 0 | National government | 38.76 | [19.27, 58.25] | 2.98 |
| 1 | Company / Companies | 18.58 | [3.02, 34.14] | 1.00 |
| 2 | Trade unions | 7.63 | [0.0, 18.25] | -0.07 |
| 3 | Employers' organisations | 7.25 | [0.0, 17.62] | -0.11 |
| 4 | Social partners jointly | 6.28 | [0.0, 15.99] | -0.20 |
| 5 | Local / regional government | 5.43 | [0.0, 14.5] | -0.28 |
| 6 | Public employment service | 4.41 | [0.0, 12.62] | -0.38 |
| 7 | Social insurance | 4.03 | [0.0, 11.9] | -0.42 |
| 8 | Other social actors (e.g. NGOs) | 3.05 | [0.0, 9.93] | -0.52 |
| 9 | Public support service providers | 2.88 | [0.0, 9.57] | -0.53 |
| 10 | EU (Council, EC, EP) | 1.31 | [0.0, 5.86] | -0.69 |
| 11 | EU level social partners | 0.38 | [0.0, 2.84] | -0.78 |
Overrepresented records from column : actors
| actors | Relative freq | Confidence Interval | z_score | |
|---|---|---|---|---|
| 0 | National government | 38.76 | [19.27, 58.25] | 2.98 |
| 1 | Company / Companies | 18.58 | [3.02, 34.14] | 1.00 |
Normal represented records from column : actors
| actors | Relative freq | Confidence Interval | z_score | |
|---|---|---|---|---|
| 2 | Trade unions | 7.63 | [0.0, 18.25] | -0.07 |
| 3 | Employers' organisations | 7.25 | [0.0, 17.62] | -0.11 |
| 4 | Social partners jointly | 6.28 | [0.0, 15.99] | -0.20 |
| 5 | Local / regional government | 5.43 | [0.0, 14.5] | -0.28 |
| 6 | Public employment service | 4.41 | [0.0, 12.62] | -0.38 |
| 7 | Social insurance | 4.03 | [0.0, 11.9] | -0.42 |
| 8 | Other social actors (e.g. NGOs) | 3.05 | [0.0, 9.93] | -0.52 |
| 9 | Public support service providers | 2.88 | [0.0, 9.57] | -0.53 |
Underrepresented records from column : actors
| actors | Relative freq | Confidence Interval | z_score | |
|---|---|---|---|---|
| 10 | EU (Council, EC, EP) | 1.31 | [0.0, 5.86] | -0.69 |
| 11 | EU level social partners | 0.38 | [0.0, 2.84] | -0.78 |
Confidence Interval for target_groups is : [1.36%, 3.4%]
| target_groups | Relative freq | Confidence Interval | z_score | |
|---|---|---|---|---|
| 0 | Employees in standard employment | 15.36 | [7.65, 23.07] | 3.89 |
| 1 | Sector specific set of companies | 14.43 | [6.92, 21.94] | 3.61 |
| 2 | SMEs | 6.97 | [1.52, 12.42] | 1.37 |
| 3 | Self-employed | 6.53 | [1.25, 11.81] | 1.24 |
| 4 | Particular professions | 6.41 | [1.17, 11.65] | 1.21 |
| 5 | Unemployed | 4.79 | [0.22, 9.36] | 0.72 |
| 6 | One person or microenterprises | 4.73 | [0.19, 9.27] | 0.70 |
| 7 | Solo-self-employed | 3.48 | [0.0, 7.4] | 0.33 |
| 8 | Other businesses | 3.11 | [0.0, 6.82] | 0.22 |
| 9 | Other groups of workers | 3.11 | [0.0, 6.82] | 0.22 |
| 10 | Companies providing essential services | 2.74 | [0.0, 6.23] | 0.11 |
| 11 | Larger corporations | 2.74 | [0.0, 6.23] | 0.11 |
| 12 | Workers in essential services | 2.61 | [0.0, 6.02] | 0.07 |
| 13 | Workers in non-standard forms of employment | 2.61 | [0.0, 6.02] | 0.07 |
| 14 | Workers in care facilities | 1.80 | [0.0, 4.64] | -0.17 |
| 15 | Other groups of citizens | 1.68 | [0.0, 4.43] | -0.21 |
| 16 | Parents in employment | 1.68 | [0.0, 4.43] | -0.21 |
| 17 | Disabled workers | 1.68 | [0.0, 4.43] | -0.21 |
| 18 | Disabled | 1.62 | [0.0, 4.32] | -0.23 |
| 19 | The COVID-19 risk group at the workplace | 1.43 | [0.0, 3.97] | -0.28 |
| 20 | Older citizens | 1.24 | [0.0, 3.61] | -0.34 |
| 21 | Parents | 1.18 | [0.0, 3.49] | -0.36 |
| 22 | Children (minors) | 1.00 | [0.0, 3.13] | -0.41 |
| 23 | Start-ups | 1.00 | [0.0, 3.13] | -0.41 |
| 24 | The COVID-19 risk group | 0.93 | [0.0, 2.98] | -0.43 |
| 25 | Youth (18-25) in employment | 0.75 | [0.0, 2.6] | -0.49 |
| 26 | Seasonal workers | 0.75 | [0.0, 2.6] | -0.49 |
| 27 | Youth (18-25) | 0.62 | [0.0, 2.3] | -0.53 |
| 28 | People in care facilities | 0.44 | [0.0, 1.86] | -0.58 |
| 29 | Migrants in employment | 0.37 | [0.0, 1.67] | -0.60 |
| 30 | Older people in employment (aged 55+) | 0.37 | [0.0, 1.67] | -0.60 |
| 31 | Posted workers | 0.25 | [0.0, 1.32] | -0.64 |
| 32 | Single parents | 0.25 | [0.0, 1.32] | -0.64 |
| 33 | Cross-border commuters | 0.25 | [0.0, 1.32] | -0.64 |
| 34 | Single parents in employment | 0.25 | [0.0, 1.32] | -0.64 |
| 35 | Migrants | 0.19 | [0.0, 1.12] | -0.66 |
| 36 | Contractors of a company | 0.19 | [0.0, 1.12] | -0.66 |
| 37 | Women | 0.12 | [0.0, 0.86] | -0.68 |
| 38 | Platform workers | 0.12 | [0.0, 0.86] | -0.68 |
| 39 | Undeclared workers | 0.12 | [0.0, 0.86] | -0.68 |
| 40 | Refugees | 0.06 | [0.0, 0.58] | -0.70 |
| 41 | Female workers | 0.06 | [0.0, 0.58] | -0.70 |
Overrepresented records from column : target_groups
| target_groups | Relative freq | Confidence Interval | z_score | |
|---|---|---|---|---|
| 0 | Employees in standard employment | 15.36 | [7.65, 23.07] | 3.89 |
| 1 | Sector specific set of companies | 14.43 | [6.92, 21.94] | 3.61 |
| 2 | SMEs | 6.97 | [1.52, 12.42] | 1.37 |
| 3 | Self-employed | 6.53 | [1.25, 11.81] | 1.24 |
| 4 | Particular professions | 6.41 | [1.17, 11.65] | 1.21 |
| 5 | Unemployed | 4.79 | [0.22, 9.36] | 0.72 |
| 6 | One person or microenterprises | 4.73 | [0.19, 9.27] | 0.70 |
| 7 | Solo-self-employed | 3.48 | [0.0, 7.4] | 0.33 |
Normal represented records from column : target_groups
| target_groups | Relative freq | Confidence Interval | z_score | |
|---|---|---|---|---|
| 8 | Other businesses | 3.11 | [0.0, 6.82] | 0.22 |
| 9 | Other groups of workers | 3.11 | [0.0, 6.82] | 0.22 |
| 10 | Companies providing essential services | 2.74 | [0.0, 6.23] | 0.11 |
| 11 | Larger corporations | 2.74 | [0.0, 6.23] | 0.11 |
| 12 | Workers in essential services | 2.61 | [0.0, 6.02] | 0.07 |
| 13 | Workers in non-standard forms of employment | 2.61 | [0.0, 6.02] | 0.07 |
| 14 | Workers in care facilities | 1.80 | [0.0, 4.64] | -0.17 |
| 15 | Other groups of citizens | 1.68 | [0.0, 4.43] | -0.21 |
| 16 | Parents in employment | 1.68 | [0.0, 4.43] | -0.21 |
| 17 | Disabled workers | 1.68 | [0.0, 4.43] | -0.21 |
| 18 | Disabled | 1.62 | [0.0, 4.32] | -0.23 |
| 19 | The COVID-19 risk group at the workplace | 1.43 | [0.0, 3.97] | -0.28 |
Underrepresented records from column : target_groups
| target_groups | Relative freq | Confidence Interval | z_score | |
|---|---|---|---|---|
| 20 | Older citizens | 1.24 | [0.0, 3.61] | -0.34 |
| 21 | Parents | 1.18 | [0.0, 3.49] | -0.36 |
| 22 | Children (minors) | 1.00 | [0.0, 3.13] | -0.41 |
| 23 | Start-ups | 1.00 | [0.0, 3.13] | -0.41 |
| 24 | The COVID-19 risk group | 0.93 | [0.0, 2.98] | -0.43 |
| 25 | Youth (18-25) in employment | 0.75 | [0.0, 2.6] | -0.49 |
| 26 | Seasonal workers | 0.75 | [0.0, 2.6] | -0.49 |
| 27 | Youth (18-25) | 0.62 | [0.0, 2.3] | -0.53 |
| 28 | People in care facilities | 0.44 | [0.0, 1.86] | -0.58 |
| 29 | Migrants in employment | 0.37 | [0.0, 1.67] | -0.60 |
| 30 | Older people in employment (aged 55+) | 0.37 | [0.0, 1.67] | -0.60 |
| 31 | Posted workers | 0.25 | [0.0, 1.32] | -0.64 |
| 32 | Single parents | 0.25 | [0.0, 1.32] | -0.64 |
| 33 | Cross-border commuters | 0.25 | [0.0, 1.32] | -0.64 |
| 34 | Single parents in employment | 0.25 | [0.0, 1.32] | -0.64 |
| 35 | Migrants | 0.19 | [0.0, 1.12] | -0.66 |
| 36 | Contractors of a company | 0.19 | [0.0, 1.12] | -0.66 |
| 37 | Women | 0.12 | [0.0, 0.86] | -0.68 |
| 38 | Platform workers | 0.12 | [0.0, 0.86] | -0.68 |
| 39 | Undeclared workers | 0.12 | [0.0, 0.86] | -0.68 |
| 40 | Refugees | 0.06 | [0.0, 0.58] | -0.70 |
| 41 | Female workers | 0.06 | [0.0, 0.58] | -0.70 |
Confidence Interval for Businesses is : [21.54%, 78.46%]
| Businesses | Relative freq | Confidence Interval | z_score | |
|---|---|---|---|---|
| 0 | Businesses_False | 64.52 | [17.63, 100.0] | 1.0 |
| 1 | Businesses_True | 35.48 | [0.0, 82.37] | -1.0 |
Overrepresented records from column : Businesses
| Businesses | Relative freq | Confidence Interval | z_score |
|---|
Normal represented records from column : Businesses
| Businesses | Relative freq | Confidence Interval | z_score | |
|---|---|---|---|---|
| 0 | Businesses_False | 64.52 | [17.63, 100.0] | 1.0 |
| 1 | Businesses_True | 35.48 | [0.0, 82.37] | -1.0 |
Underrepresented records from column : Businesses
| Businesses | Relative freq | Confidence Interval | z_score |
|---|
Confidence Interval for Citizens is : [0%, 100%]
| Citizens | Relative freq | Confidence Interval | z_score | |
|---|---|---|---|---|
| 0 | Citizens_False | 93.09 | [68.24, 100.0] | 1.0 |
| 1 | Citizens_True | 6.91 | [0.0, 31.76] | -1.0 |
Overrepresented records from column : Citizens
| Citizens | Relative freq | Confidence Interval | z_score |
|---|
Normal represented records from column : Citizens
| Citizens | Relative freq | Confidence Interval | z_score | |
|---|---|---|---|---|
| 0 | Citizens_False | 93.09 | [68.24, 100.0] | 1.0 |
| 1 | Citizens_True | 6.91 | [0.0, 31.76] | -1.0 |
Underrepresented records from column : Citizens
| Citizens | Relative freq | Confidence Interval | z_score |
|---|
Confidence Interval for Workers is : [41.94%, 58.06%]
| Workers | Relative freq | Confidence Interval | z_score | |
|---|---|---|---|---|
| 0 | Workers_False | 54.11 | [5.28, 100.0] | 1.0 |
| 1 | Workers_True | 45.89 | [0.0, 94.72] | -1.0 |
Overrepresented records from column : Workers
| Workers | Relative freq | Confidence Interval | z_score |
|---|
Normal represented records from column : Workers
| Workers | Relative freq | Confidence Interval | z_score | |
|---|---|---|---|---|
| 0 | Workers_False | 54.11 | [5.28, 100.0] | 1.0 |
| 1 | Workers_True | 45.89 | [0.0, 94.72] | -1.0 |
Underrepresented records from column : Workers
| Workers | Relative freq | Confidence Interval | z_score |
|---|
Confidence Interval for funding is : [0.69%, 15.97%]
| funding | Relative freq | Confidence Interval | z_score | |
|---|---|---|---|---|
| 0 | National funds | 47.79 | [27.81, 67.77] | 3.05 |
| 1 | No special funding required | 18.22 | [2.78, 33.66] | 0.76 |
| 2 | Companies | 10.21 | [0.0, 22.32] | 0.15 |
| 3 | European Funds | 8.76 | [0.0, 20.07] | 0.03 |
| 4 | Employer | 4.48 | [0.0, 12.76] | -0.30 |
| 5 | Regional funds | 3.34 | [0.0, 10.53] | -0.39 |
| 6 | Local funds | 1.70 | [0.0, 6.87] | -0.51 |
| 7 | Other | 1.70 | [0.0, 6.87] | -0.51 |
| 8 | Employers organisation | 1.58 | [0.0, 6.57] | -0.52 |
| 9 | Employees | 1.13 | [0.0, 5.36] | -0.56 |
| 10 | Trade union | 0.88 | [0.0, 4.62] | -0.58 |
| 11 | Social partners jointly | 0.19 | [0.0, 1.93] | -0.63 |
Overrepresented records from column : funding
| funding | Relative freq | Confidence Interval | z_score | |
|---|---|---|---|---|
| 0 | National funds | 47.79 | [27.81, 67.77] | 3.05 |
| 1 | No special funding required | 18.22 | [2.78, 33.66] | 0.76 |
Normal represented records from column : funding
| funding | Relative freq | Confidence Interval | z_score | |
|---|---|---|---|---|
| 2 | Companies | 10.21 | [0.0, 22.32] | 0.15 |
| 3 | European Funds | 8.76 | [0.0, 20.07] | 0.03 |
| 4 | Employer | 4.48 | [0.0, 12.76] | -0.30 |
| 5 | Regional funds | 3.34 | [0.0, 10.53] | -0.39 |
| 6 | Local funds | 1.70 | [0.0, 6.87] | -0.51 |
| 7 | Other | 1.70 | [0.0, 6.87] | -0.51 |
| 8 | Employers organisation | 1.58 | [0.0, 6.57] | -0.52 |
| 9 | Employees | 1.13 | [0.0, 5.36] | -0.56 |
| 10 | Trade union | 0.88 | [0.0, 4.62] | -0.58 |
Underrepresented records from column : funding
| funding | Relative freq | Confidence Interval | z_score | |
|---|---|---|---|---|
| 11 | Social partners jointly | 0.19 | [0.0, 1.93] | -0.63 |
# Columns whose class labels are passed to the collision checks below.
CLASS_COLUMNS = ['category', 'subcategory','actors','target_groups','Businesses','Citizens','Workers', 'funding']
def convert_to_binary_matrix(data: pd.DataFrame) -> pd.DataFrame:
    """
    One-hot encode every value found in the cells of `data`.

    Each distinct cell value becomes a column of the result. A cell holding a
    list contributes one column per list element; any other cell contributes
    a single column named after its value. The result has one row per input
    row (re-indexed from 0), with 1 where the value occurs in that row and 0
    elsewhere.

    :param data: frame whose cells are either hashable scalars or lists of
        hashable values.
    :return: the binary indicator matrix described above.
    """
    encoded_rows = []
    # itertuples avoids building a Series per row (as iterrows does) and
    # yields the cell values in column order.
    for values in data.itertuples(index=False, name=None):
        flags = {}
        for value in values:
            if isinstance(value, list):
                for label in value:
                    flags[label] = 1
            else:
                flags[value] = 1
        encoded_rows.append(flags)
    # Build the frame once: DataFrame.append was removed in pandas 2.0 and
    # copied the whole frame on every call.
    return pd.DataFrame(encoded_rows).fillna(0)
def dependency_table(data: pd.DataFrame, dependecy_level: float = 0.9):
    """
    Find, for every column of a binary matrix, the other columns that almost
    always accompany it.

    For a column C, only the rows where C equals 1 are considered. The share
    of those rows in which another column is also set is computed, and the
    columns whose share reaches `dependecy_level` are kept.

    :param data: binary indicator matrix (cells are 0 or 1).
    :param dependecy_level: minimal share, relative to the rows where the
        examined column is 1, for another column to be reported.
    :return: a frame with one column per examined column that has at least
        one strong companion, indexed by the companion columns; cells hold
        the share, or 0 where the threshold was not reached.
    """
    strong_links = {}
    for column in data.columns:
        # Column totals restricted to the rows in which `column` is present.
        totals = data.loc[data[column] == 1].sum()
        # Normalise by the column's own count, then discard the self-entry.
        shares = (totals / totals[column]).drop(column)
        shares = shares.loc[shares.values >= dependecy_level]
        if shares.size == 0:
            continue
        strong_links[column] = {label: shares[label] for label in shares.index}
    return pd.DataFrame(strong_links).fillna(0)
def class_collision_in_columns(data: pd.DataFrame, dependency_level: float = 0.7):
    """
    Look for class collisions inside each column separately and display them.

    Every column is one-hot encoded on its own with convert_to_binary_matrix
    and examined with dependency_table. When at least one label of the column
    is accompanied by another label in `dependency_level` or more of its rows,
    a heading and a heatmap of the dependency table are displayed.

    :param data: frame whose columns hold class labels (scalars or lists).
    :param dependency_level: threshold forwarded to dependency_table;
        defaults to 0.7, the value that was previously hard-coded.
    """
    for column in data.columns:
        # Rebuild a one-column frame from the raw values only.
        single_column = pd.DataFrame({column: data[column].values})
        dependencies = dependency_table(
            convert_to_binary_matrix(single_column), dependency_level
        )
        if dependencies.size > 0:
            display(Markdown(f"Collision in column : {column}"))
            display(px.imshow(dependencies))
# Check each class column on its own for labels that co-occur.
class_collision_in_columns(df[CLASS_COLUMNS])
Collision in column : actors
Collision in column : target_groups
Collision in column : funding
def class_collision(data: pd.DataFrame, dependency_level: float = 0.7):
    """
    Look for class collisions across all columns of `data` and display them.

    The whole frame is one-hot encoded with convert_to_binary_matrix, the
    dependency table of the resulting labels is computed with
    dependency_table, and the table is displayed as a heatmap under a
    heading.

    :param data: frame whose columns hold class labels (scalars or lists).
    :param dependency_level: threshold forwarded to dependency_table;
        defaults to 0.7, the value that was previously hard-coded.
    """
    dependencies = dependency_table(
        convert_to_binary_matrix(data), dependency_level
    )
    display(Markdown("Collision in dataframe"))
    # aspect='auto' lets the heatmap cells stretch to fill the figure.
    display(px.imshow(dependencies, aspect='auto'))
# Check all class columns together for labels that co-occur across columns.
class_collision(df[CLASS_COLUMNS])
Collision in dataframe